# Who Cares if You Vote? Partisan agreement and injunctive norms of voting
# Edward Fieldhouse (1), David Cutts (2), Jack Bailey (1)
# (1) The University of Manchester, (2) The University of Birmingham


# 1. Housekeeping ---------------------------------------------------------

# The following code requires that you have first opened the associated
# project file. If not, click the button on the top right and open it
# ("Open Project...").

# Note also that the Bayesian methods we use below rely on having installed
# the probabilistic programming language Stan. If you have yet to install
# Stan, please see https://mc-stan.org/users/interfaces/.

# Load packages

library(tidyverse)
library(tidybayes)
library(magrittr)
library(haven)
library(jbmisc)   # devtools::install_github("jackobailey/jbmisc)
library(here)


# Load data from the British Election Study Internet Panel. Note: If you
# plan to use this data, please make sure that you download the most recent
# version available at https://www.britishelectionstudy.com.

dta <- read_sav(here("_data", "BES2015_W4_v4.0.sav.zip"))



# 2. Transform data -------------------------------------------------------

# Load data and select variables

dta <-
  dta %>%
  select(
    # Respondent-level covariates
    id,
    wt,
    age = Age,
    female = gender,
    edu = edlevel,
    edu2 = education,
    married = marital,
    discuss = discussPolDays,
    att = polAttention,
    no_und = efficacyNotUnderstand,
    pol_care = efficacyPolCare,
    duty = dutyToVote2,
    pid = partyId,
    sq = partyIdSqueeze,
    str = partyIdStrength,
    turn = profile_turnout_2015,
    
    # Discussant-level covariates
    rel = num_range("relationshipName", 1:3),
    app = num_range("discussantApprovalVoteName", 1:3),
    vote = num_range("discussantVoteName", 1:3),
    pol = num_range("discussPolDaysD", 1:3)
  )


# Load randomisation variable

rnd <- read_dta(here("_data", "randvar_w4.dta.zip"))


# Merge randomisation variable into data and filter non-participating respondents

dta <- 
  dta %>% 
  left_join(rnd, by = "id") %>% 
  filter(rnd_var == 1)


# First, we'll deal with all of the respondent-level variables. To get started
# we'll start with age, which we're going to convert to z-scores to help the
# Bayesian models we fit later converge.

dta$age <-
  dta$age %>% 
  mark_na(-8:-9) %>% 
  as.numeric() %>% 
  rescale(type = "z")


# Next, we'll convert the "female" variable to a factor that takes the value 1
# where the respondent is female and 0 where they are male.

dta$female <-
  dta$female %>%
  as_factor()


# We now need to deal with our two education variables. We need "edu2" only as
# it contains information on who has some other qualification and whose are not
# known. We'll add these categories to "edu1", then populate them from "edu2".

dta$edu <-
  dta$edu %>%
  as_factor() %>%
  factor(levels = c("No qualifications",
                    "Below GCSE",
                    "GCSE",
                    "A-level",
                    "Undergraduate",
                    "Postgrad",
                    "Other",
                    "Unknown"),
         labels = c("None",
                    "Below GCSE",
                    "GCSE",
                    "A-level",
                    "Undergraduate",
                    "Postgraduate",
                    "Other",
                    "Unknown"))

dta$edu[dta$edu2 == 18] <- "Other"
dta$edu[is.na(dta$edu) == T] <- "Unknown"


# Now we'll move on to marital status. We want to convert this to a new variable
# that tells us if the respondent is cohabiting. This is because we expect similar
# "marriage" effects for those who are married, living as married, or in civil
# partnerships.

dta$cohabit <-
  dta$married %>%
  as_factor() %>%
  as_dummy(c("Married", "Living as married", "Civil Partnership")) %>%
  factor(labels = c("Other",
                    "Cohabiting"))


# There are now a host of attitudinal variables that we must deal with. These each
# have roughly the same issues. And, in most cases, we can simply mark "Don't know"
# cases as missing, convert them to numeric, and include them in our model. We'll
# do them all at once.

dta$discuss <-
  dta$discuss %>%
  as_factor() %>% 
  mark_na("Don't know") %>%
  as.numeric() %>% 
  subtract(1)


dta$att <-
  dta$att %>%
  as_factor() %>% 
  mark_na("Don't know") %>%
  as.numeric() %>% 
  subtract(1)


dta$no_und <-
  dta$no_und %>%
  as_factor() %>% 
  mark_na("Don't know") %>%
  as.numeric() %>%
  subtract(1)


dta$pol_care <-
  dta$pol_care %>%
  as_factor() %>% 
  mark_na("Don't know") %>%
  as.numeric() %>%
  subtract(1)


dta$duty <- 
  dta$duty %>% 
  as_factor() %>% 
  mark_na("Don't know") %>% 
  as.numeric() %>% 
  subtract(1)


# Now we have to convert the turnout variable to a dummy that takes the value 1
# where the respondent said that they voted and 0 otherwise.

dta$turn <-
  dta$turn %>%
  as_factor() %>% 
  as_dummy("Yes, voted") %>%
  factor(labels = c("Didn't vote", "Voted"))


# Next we have to create a party identification variable that combines the two
# pid variables in our data. First, we use the responses from the squeeze variable
# "sq" to populate our main pid variable where respondents said that they identified
# with no party (10) or did not know (9999). We then convert the resulting variable
# to a factor so that we can include it in our model.

dta$pid[dta$pid %in% c(10, 9999)] <- dta$sq[dta$pid %in% c(10, 9999)]

# Party ID
dta$pid <-
  dta$pid %>%
  as_factor() %>%
  droplevels() %>% 
  factor(labels = c("Conservative",
                    "Labour",
                    "Liberal Democrat",
                    "SNP",
                    "Plaid Cymru",
                    "UKIP",
                    "Green Party",
                    "BNP",
                    "Other",
                    "None",
                    "Don't know"))


# Finally for our respondent-level data, we need to recode the party identification strength
# variable to include those respondents with no party identification, whichb we'll also set
# as the reference category. We'll also mark any cases where the respondent says that they
# "Don't know" how strongly they identified as NA.

dta$str <- 
  dta$str %>% 
  as_factor() %>% 
  mark_na("Don't know") %>% 
  fct_explicit_na("None") %>% 
  fct_relevel("None")



# Next, we must move on to the discussant-level covariates. These are tricky, so the code might
# get quite long below. First, we'll start by converting the dyadic relationship variables to
# factors using the lapply() function.

dta[, paste0("rel", 1:3)] <-
  dta[, paste0("rel", 1:3)] %>%
  lapply(as_factor)


# Now we'll convert the discussants' voting variable so that it takes the same values as the
# respondent party identification variable.

dta[, paste0("vote", 1:3)] <-
  dta[, paste0("vote", 1:3)] %>%
  lapply(as_factor) %>%
  lapply(droplevels) %>% 
  lapply(factor, labels = c("Labour",
                            "Conservative",
                            "Liberal Democrat",
                            "SNP",
                            "Plaid Cymru",
                            "Green Party",
                            "UKIP",
                            "BNP",
                            "Other",
                            "None",
                            "Don't know"))


# Next, we need to create a count of discussants for each respondent. We can do this by adding
# up how many NAs the respondent has across the 3 "rel" columns, then subtracting the answer
# from 3 -- the maximum number of discussants.

dta$n_disc <- 3 - rowSums(is.na(dta[, paste0("rel", 1:3)]) == T)


# Convert to data.frame
dta <- dta %>% data.frame()


# One complicating factor is that some respondents report fewer than 3 discussants, but do not
# necessary put those that they do report in the first slot. Sometimes, for example, respondents
# with 2 discussants will use the 2nd and 3rd slot, or the 1st and 3rd slot. We need to deal with
# this and make sure that where the respondent has fewer than 3 discussants, they always appear in
# the respective column. E.g. a respondent with 1 discussant will have responses in only column 1
# and a respondent with 2 discussants will have responses in only columns 1 and 2. As you can see
# from the output below this is not currently the case. This takes the form of a lot of recoding.

dta[, paste0("rel", 1:3)][dta$n_disc == 2 & is.na(dta$rel1) == T, ] <-
  tibble(rel1 = dta$rel2[dta$n_disc == 2 & is.na(dta$rel1) == T],
         rel2 = dta$rel3[dta$n_disc == 2 & is.na(dta$rel1) == T],
         rel3 = NA)

dta[, paste0("vote", 1:3)][dta$n_disc == 2 & is.na(dta$vote1) == T, ] <-
  tibble(vote1 = dta$vote2[dta$n_disc == 2 & is.na(dta$vote1) == T],
         vote2 = dta$vote3[dta$n_disc == 2 & is.na(dta$vote1) == T],
         vote3 = NA)

dta[, paste0("app", 1:3)][dta$n_disc == 2 & is.na(dta$app1) == T, ] <-
  tibble(app1 = dta$app2[dta$n_disc == 2 & is.na(dta$app1) == T],
         app2 = dta$app3[dta$n_disc == 2 & is.na(dta$app1) == T],
         app3 = NA)

dta[, paste0("rel", 1:3)][dta$n_disc == 2 & is.na(dta$rel2) == T, ] <-
  tibble(rel1 = dta$rel1[dta$n_disc == 2 & is.na(dta$rel2) == T],
         rel2 = dta$rel3[dta$n_disc == 2 & is.na(dta$rel2) == T],
         rel3 = NA)

dta[, paste0("vote", 1:3)][dta$n_disc == 2 & is.na(dta$vote2) == T, ] <-
  tibble(vote1 = dta$vote1[dta$n_disc == 2 & is.na(dta$vote2) == T],
         vote2 = dta$vote3[dta$n_disc == 2 & is.na(dta$vote2) == T],
         vote3 = NA)

dta[, paste0("app", 1:3)][dta$n_disc == 2 & is.na(dta$app2) == T, ] <-
  tibble(app1 = dta$app1[dta$n_disc == 2 & is.na(dta$app2) == T],
         app2 = dta$app3[dta$n_disc == 2 & is.na(dta$app2) == T],
         app3 = NA)

dta[, paste0("rel", 1:3)][dta$n_disc == 1 & is.na(dta$rel1) == T & is.na(dta$rel2) == T, ] <-
  tibble(rel1 = dta$rel3[dta$n_disc == 1 & is.na(dta$rel1) == T & is.na(dta$rel2) == T],
         rel2 = NA,
         rel3 = NA)

dta[, paste0("vote", 1:3)][dta$n_disc == 1 & is.na(dta$vote1) == T & is.na(dta$vote2) == T, ] <-
  tibble(vote1 = dta$vote3[dta$n_disc == 1 & is.na(dta$vote1) == T & is.na(dta$vote2) == T],
         vote2 = NA,
         vote3 = NA)

dta[, paste0("app", 1:3)][dta$n_disc == 1 & is.na(dta$app1) == T & is.na(dta$app2) == T, ] <-
  tibble(app1 = dta$app3[dta$n_disc == 1 & is.na(dta$app1) == T & is.na(dta$app2) == T],
         app2 = NA,
         app3 = NA)

dta[, paste0("rel", 1:3)][dta$n_disc == 1 & is.na(dta$rel1) == T & is.na(dta$rel3) == T, ] <-
  tibble(rel1 = dta$rel2[dta$n_disc == 1 & is.na(dta$rel1) == T & is.na(dta$rel3) == T],
         rel2 = NA,
         rel3 = NA)

dta[, paste0("vote", 1:3)][dta$n_disc == 1 & is.na(dta$vote1) == T & is.na(dta$vote3) == T, ] <-
  tibble(vote1 = dta$vote2[dta$n_disc == 1 & is.na(dta$vote1) == T & is.na(dta$vote3) == T],
         vote2 = NA,
         vote3 = NA)

dta[, paste0("app", 1:3)][dta$n_disc == 1 & is.na(dta$app1) == T & is.na(dta$app3) == T, ] <-
  tibble(app1 = dta$app2[dta$n_disc == 1 & is.na(dta$app1) == T & is.na(dta$app3) == T],
         app2 = NA,
         app3 = NA)


# Finally, we need to recode our main independent variable -- discussant approval.
# This is a little bit tricky too. First, we'll recode these variables into dummy
# variables that take the value 1 where the respondent said that the discussant
# would care if they voted and 0 otherwise. We will include the original "app"
# variables at the 2nd level, but need to create a new variable to capture the
# main effect at the individual level while accounting for the number of discussants
# that each respondent has. We do this by calculating the weighted average of the
# "app" variables.

dta[, paste0("app", 1:3)] <-
  dta[, paste0("app", 1:3)] %>%
  lapply(as_factor) %>% 
  lapply(as_dummy, "Yes")

dta$app_fix <- rowMeans(dta[, paste0("app", 1:3)])


# Another tricky bit, we need to create new variables that tell us about the nature of
# each respondents-discussant dyad's partisanship. We can do this using the case_when()
# function.

opt <- c("None", "Don't know")

dta <-
  dta %>%
  mutate(
    prt1 = 
      case_when(
        pid == vote1 & !(pid %in% opt) & !(vote1 %in% opt) & pid != "Other" & vote1 != "Other" & is.na(rel1) == F ~ "Shared Partisanship",
        pid != vote1 & !(pid %in% opt) & !(vote1 %in% opt) & is.na(rel1) == F ~ "Opposing Partisanship",
        pid == "Other" & vote1 == "Other" & is.na(rel1) == F ~ "Opposing Partisanship",
        pid %in% opt & !(vote1 %in% opt) & is.na(rel1) == F ~ "Respondent Non-partisan",
        vote1 %in% opt & !(pid %in% opt) & is.na(rel1) == F ~ "Discussant Non-partisan",
        pid %in% opt & vote1 %in% opt & is.na(rel1) == F ~ "Shared Non-partisan"),
    prt2 = 
      case_when(
        pid == vote2 & !(pid %in% opt) & !(vote2 %in% opt) & pid != "Other" & vote2 != "Other" & is.na(rel2) == F ~ "Shared Partisanship",
        pid != vote2 & !(pid %in% opt) & !(vote2 %in% opt) & is.na(rel2) == F ~ "Opposing Partisanship",
        pid == "Other" & vote2 == "Other" & is.na(rel2) == F ~ "Opposing Partisanship",
        pid %in% opt & !(vote2 %in% opt) & is.na(rel2) == F ~ "Respondent Non-partisan",
        vote2 %in% opt & !(pid %in% opt) & is.na(rel2) == F ~ "Discussant Non-partisan",
        pid %in% opt & vote2 %in% opt & is.na(rel2) == F ~ "Shared Non-partisan"),
    prt3 =
      case_when(
        pid == vote3 & !(pid %in% opt) & !(vote3 %in% opt) & pid != "Other" & vote3 != "Other" & is.na(rel3) == F ~ "Shared Partisanship",
        pid != vote3 & !(pid %in% opt) & !(vote3 %in% opt) & is.na(rel3) == F ~ "Opposing Partisanship",
        pid == "Other" & vote3 == "Other" & is.na(rel3) == F ~ "Opposing Partisanship",
        pid %in% opt & !(vote3 %in% opt) & is.na(rel3) == F ~ "Respondent Non-partisan",
        vote3 %in% opt & !(pid %in% opt) & is.na(rel3) == F ~ "Discussant Non-partisan",
        pid %in% opt & vote3 %in% opt & is.na(rel3) == F ~ "Shared Non-partisan")
  )


plot1 <- 
  dta %>% 
  group_by(n_disc) %>%
  summarize(n = sum(wt, na.rm = T)) %>%
  ungroup() %>% 
  mutate(
    n = n/sum(n)
  ) %>% 
  ggplot(
    aes(
      x = n_disc,
      y = n
    )
  ) +
  geom_col(colour = bailey_colours("blue"),
           fill = bailey_colours("blue5")) +
  geom_text(aes(y = n,
                label = paste0(format(round(n, 3)*100, nsmall = 1), "%")
  ),
  family = "Cabin",
  size = 2,
  vjust = -1) +
  scale_y_continuous(limits = c(0, .5),
                     labels = scales::percent_format(accuracy = 1)) +
  labs(
    y = "",
    x = "No. Discussants"
  ) +
  theme_bailey() +
  theme(title = element_blank())



# Finally, we'll select only the variables we need for our analysis, then remove any missing
# data using listwise deletion.

dta <-
  dta %>%
  select(wt,
         age,
         female,
         edu,
         cohabit,
         discuss,
         att,
         no_und,
         pol_care,
         duty,
         str,
         turn,
         app_fix,
         pid,
         vote_ = num_range("vote", 1:3),
         num_range("rel", 1:3),
         num_range("app", 1:3),
         prt_ = num_range("prt", 1:3),
         num_range("w", 1:3)) %>% 
  na.omit() %>% 
  select(wt,
         pid,
         num_range("prt_", 1:3),
         num_range("vote_", 1:3)) %>% 
  mutate(id = 1:n()) %>% 
  reshape(.,
          varying = paste0(rep(c("prt_", "vote_"), each = 3), 1:3),
          sep = "_",
          direction = "long")



# 3. Create "Heat Map Style" Plot ----------

dta <- 
  dta %>% 
  mutate(
    pid =
      pid %>% 
      factor(
        levels =
          c("Conservative",
            "Labour",
            "Liberal Democrat",
            "SNP",
            "Plaid Cymru",
            "UKIP",
            "Green Party",
            "BNP",
            "Other",
            "None",
            "Don't know"),
        labels =
          c("Con",
            "Lab",
            "Lib",
            "SNP",
            "PC",
            "UKIP",
            "Grn",
            "BNP",
            "Oth",
            "Non",
            "DK")
      ),
    vote =
      vote %>% 
      factor(
        levels =
          c("Conservative",
            "Labour",
            "Liberal Democrat",
            "SNP",
            "Plaid Cymru",
            "UKIP",
            "Green Party",
            "BNP",
            "Other",
            "None",
            "Don't know"),
        labels =
          c("Con",
            "Lab",
            "Lib",
            "SNP",
            "PC",
            "UKIP",
            "Grn",
            "BNP",
            "Oth",
            "Non",
            "DK")
      )
  ) %>% 
  group_by(vote, pid) %>%
  summarize(n = sum(wt)) %>%
  spread(pid, n) %>% 
  janitor::adorn_percentages(denominator = "all") %>% 
  pivot_longer(cols = c(Con, Lab, Lib, SNP, PC, UKIP, Grn, BNP, Oth, Non, DK),
               names_to = "pid",
               values_to = "pct") %>% 
  mutate(
    pid =
      pid %>% 
      factor(
        levels =
          c("DK",
            "Non",
            "Oth",
            "BNP",
            "Grn",
            "UKIP",
            "PC",
            "SNP",
            "Lib",
            "Lab",
            "Con")
      ),
    prt =
      case_when(
        pid == vote & !(pid %in% c("Non", "DK")) & !(vote %in% c("Non", "DK")) & pid != "Oth" & vote != "Oth" & is.na(rel) == F ~ "Shared Partisanship",
        pid != vote & !(pid %in% c("Non", "DK")) & !(vote %in% c("Non", "DK")) & is.na(rel) == F ~ "Opposing Partisanship",
        pid == "Oth" & vote == "Oth" & is.na(rel) == F ~ "Opposing Partisanship",
        pid %in% c("Non", "DK") & !(vote %in% c("Non", "DK")) & is.na(rel) == F ~ "Respondent Non-partisan",
        vote %in% c("Non", "DK") & !(pid %in% c("Non", "DK")) & is.na(rel) == F ~ "Discussant Non-partisan",
        pid %in% c("Non", "DK") & vote %in% c("Non", "DK") & is.na(rel) == F ~ "Shared Non-partisan"
      ) %>% 
      factor(
        levels =
          c(
            "Shared Partisanship",
            "Opposing Partisanship",
            "Respondent Non-partisan",
            "Discussant Non-partisan",
            "Shared Non-partisan"
          ),
        labels = 
          c(
            "Shared\nPartisanship",
            "Opposing\nPartisanship",
            "Respondent\nNonpartisan",
            "Discussant\nNonpartisan",
            "Shared\nNonpartisan"
          )
      ),
    pct = ifelse(is.na(pct), 0, pct)
  )


plot2 <- 
  dta %>% 
  ggplot(
    aes(
      x = vote,
      y = pid,
      fill = prt,
      label = ifelse(round(pct, digits = 3)*100 < .1,
                     "0",
                     as.character(round(pct, digits = 3)*100))
    )
  ) +
  geom_tile(colour = "black",
            size = 0.5) +
  geom_text(color = ifelse(dta$prt == "Shared\nNonpartisan", "white", "black"),
            size = 2,
            family = "Cabin") +
  scale_x_discrete(position = "top") +
  scale_fill_manual(
    values =
      bailey_colours(
        "white",
        "grey7",
        "blue10",
        "red10",
        "black"
      )
  ) +
  coord_fixed() +
  labs(
    y = "Respondent: Party ID",
    x = "Discussant: Expected Vote"
  ) +
  theme_bailey() +
  theme(legend.position = "bottom",
        legend.box.margin = margin(-6, -10, -2, -10),
        legend.key.size = unit(.3, "cm"),
        legend.title = element_blank(),
        legend.text = element_text(size = rel(.6), hjust = .5),
        title = element_blank(),
        axis.title.x.top = element_text(margin = margin(b = 7)),
        axis.title.y.left = element_text(margin = margin(r = 7)),
        axis.ticks.x = element_blank(),
        axis.line = element_blank())


plot <- ggpubr::ggarrange(plot1, plot2,
                          widths = c(1, 1.618))

# And, finally, we'll export this plot as a jpeg

jpeg(file = here("_output", "crosstab_w4.jpeg"),
     res = 300,
     width = 5.97,
     height = 3.69,
     units = "in")
plot
dev.off()
